org 100h   ; assume ax=bx=0 cx=0xff

;UNPACK:   ; ax=0x200f bx=cx=0 si=0x1dc di=0x355

;Startup: take the word on top of the stack so that di=0 and sp=0.
;The pushes below then fill memory downwards from 0xffff.
  pop di   ; di=sp=0

;Prepare floating-point constants for SSE
;[0xfff0]=0xffe00000, [0xffe0]=0xffc00000, ... [0x8000]=0
;step 0x00200000: ... 1 1.25 1.5 1.75  2 2.5 3 3.5  4 5 6 7 ...
;
;Each pass pushes one dword: high word = bx & 0xffe0, low word = di (0).
;bx drops by 8 per pass, so the masked value changes every 4 passes and
;every constant ends up as 4 identical dwords = one aligned 16-byte vector.
;
;The mask must be a 16-bit AND. `and al,0xe0` yields the same ax value but
;sets ZF from the low byte alone, which would end the loop at bx=0xff18
;(29 passes) instead of at bx=0x0018 as documented on the jnz below.
;push does not change flags, so jnz still sees the ZF produced by the AND.
;The loop stops at sp=0x800c, so the 16-byte slot at 0x8000 is only partly
;written; none of the K_* constants defined in this file point at it.

PREP:
  sub bx,8
  mov ax,bx
  and ax,0xffe0 ; ax = bx rounded down to a multiple of 0x20; ZF only when ax==0
  push ax  ; store the same constant four times
  push di  ; 0
  jnz PREP ; loop 8189 times -> 32kB; sp=0x800c bx=0x0018 ax=0

;  mov al,13h
;  int 10h
;ES = 0xa000 is the destination segment for the stosb pixel writes.
  push 0xa000
  pop es
  fninit
;Initial time is whatever 16-bit integer sits at [si]; si is not set in this
;file (the UNPACK note above lists si=0x1dc) - confirm what it points at.
  fild word[si]     ;| t=t0

;Palette: Luminance * Hue: diffuse = L*[0.2,H,1], specular = L^9 / 2
;  mov dx,3c8h
;  xor ax,ax
;  out dx,al
;  inc dx
;PAL:
;  or ax,0b1100001111000011  ; ax = 11LLLL11 11HHHH11
;  push ax
;Q out dx,al
;  imul ah
;  shr ax,7
;  loop Q
;  pop ax
;  inc ax
;  jnz PAL

;K(x): address of the 16-byte vector holding four copies of the single-precision
;float whose bit pattern is x<<16 (x = top 16 bits of the IEEE-754 encoding).
;The table built by PREP is laid out so that the float with high word x lives
;at 0x8000 + 0x10*(x/0x20). x must be a multiple of 0x20; the division is an
;integer division, so any other value silently rounds down to the previous slot.
;Both the argument and the whole expansion are parenthesized so that K(...) stays
;correct when used with an expression argument or inside a larger expression.
%define K(x) (0x8000 + 0x10*((x)/0x20))
%define K_TRANSLATION       K(0xbe80)  ; -0.25
%define K_TRANSLATION_SCALE K(0x3e80)  ; 0.25
%define K_TIME_DELTA        K(0x3c40)  ; 0.75/64

;%define K_EPS               K(0x3c80)  ; 0x3c60=0.875/64, 0x3c80=1/64
;%define K_BRIGHT_MAGIC      K(0x4640)  ; ulp=EPS/16 = 1/1024 => magic=1.5*2^13
;%define K_EPS               K(0x3ce0)  ; 0x3ce0=0.875/32
%define K_EPS               K(0x3d00)  ; 0x3d00=1/32
%define K_BRIGHT_MAGIC      K(0x46c0)  ; ulp=EPS/16 = 1/512 => magic=1.5*2^14

%define K_HUE_MAGIC         K(0x4940)  ; ulp=1/16   = 1/16  => magic=1.5*2^19
;%define K_HUE_MAGIC         K(0x48c0)  ; ulp=2/16   = 1/32  => magic=1.5*2^18

%define K_NEG_ABS           K(0x8000)  ; -0: sign bit for -abs
%define K_MINUS_1           K(0xbf80)  ; -1
%define K_1                 K(0x3f80)  ; 1

;For 16:9 screens: pixel aspect ratio = 1.03
%define K_X_SCALE           K(0x3020)  ; 2.5 * 2**-32: x -> ..1.25
%define K_Y_SCALE           K(0x2fe0)  ; 1.75 * 2**-32: y -> ..0.6836

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE      K(0x3000)  ; 2.0 * 2**-32: x -> ..1.0
;%define K_Y_SCALE      K(0x3000)  ; 2.0 * 2**-32: y -> ..0.7813

;For each frame: prepare rotation constants
;Entry: the x87 stack holds only the running time t (st0).
;Builds sin/cos of three angles (t, log2(e)*t, log10(2)*t), then spills them
;to memory as 4-wide vectors for the SSE rotation loop in MAP.
;Exit: x87 stack is back to just t, bx=0x480, cl=0.
;The "|" comments list the x87 stack, st0 first.
M fadd dword[K_TIME_DELTA] ;| t+=dt
  fld st0
  fsincos          ;| C1 S1 t
  fldl2e
  fmul st3         ;| 1.44*t C1 S1 t
  fsincos          ;| C2 S2 C1 S1 t
  fldlg2
  fmul st5         ;| 0.30*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t
;  fldlg2 ;0.30103
;  fldln2 ;0.69315

;Store each constant four times
;bh=4 doubles as the dword stride added to bl. The inner loop writes st0 to
;4 consecutive dwords (one 16-byte vector); the outer loop pops it and
;continues with the next value. `loop` and `fstp` leave the CPU flags alone,
;so `jns` tests the sign of bl from the last `add bl,bh` and exits when bl
;reaches 0x80. `mov cl,4` gives a count of 4 only while ch=0.
  mov bx,0x420     ; bh=4
STORE:
  mov cl,4
STORE4:
  fst dword[bx]    ;0x400 10 20 30 40 50 60 70 80
  add bl,bh        ;   XY    C3 S3 C2 S2 C1 S1 scratch
  loop STORE4
  fstp st0
  jns STORE        ; loop 6 times: bx=0x480

;Operand shorthands for the rotation loop in MAP: with bx at 0x420/0x440/0x460,
;[COS] is the cosine vector and [SIN] the matching sine vector 16 bytes above.
%define COS bx
%define SIN bx+0x10


;For each 4-pixel batch:
;Runs 4 times (cl=4, relies on ch=0). Each pass does two things:
; - writes the 32-bit screen coordinate of pixel di into slot 0x400+4*i
;   for the upcoming MAP calls;
; - emits, via stosb at es:di, the colour computed for slot i of the
;   PREVIOUS batch (mask bit from bp, brightness byte from [0x480+4*i]).
;On the very first batch bp and [0x480..] have not been written by this code.
;Exit: di advanced by 4, bx=0x490, cx=0.
X mov cl,4         ; bx=0x480

;Store coordinates for this batch
;dx:ax = di*0xcccd, then 0x9b80 is added to the high word. The INT_X/INT_Y
;comments below give the ranges the author derives from this value.
B mov ax,0xcccd
  mul di
  add dx,0x9b80
  mov [bx-128],ax  ; 0x400
  mov [bx-128+2],dx; 0x402

;Combine brightness and hue from the last batch
;shr moves the next mask bit into CF; sbb turns it into ax=0xffff (keep)
;or ax=0 (background).
  shr bp,1       ; background mask
  sbb ax,ax

;  and ah,[bx]    ; brightness and hue, or 0 (background)
;  and al,[bx+si]
;  aad 16         ; al += 16*ah
;  stosb

;al = low byte of the stored float, or 0. A nonzero value v is written as
;colour v+15; zero is written as colour 0.
  and al,[bx]    ; brightness (1..16) or 0 (background)
  jz Z
  add al,15
Z stosb

  add bl,bh      ; next dword slot (bh=4)
  loop B         ; di+=4

;Operands used by MAP after it resets bl to 0 (bx=0x400):
;[INT_Y] is the coordinate dword itself; [INT_X] is the same data read one
;byte lower, so the low 24 bits of each dword land in the top 24 bits.
%define INT_X bx-1  ; x = 2^32 * (-0.5..0.5)
%define INT_Y bx    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)



;SSE register roles. Every register carries 4 lanes = the 4 pixels of a batch.
%define x xmm0 ; XYZ coordinates for iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [c,c/4,0]
%define d xmm7 ; depth

;Trace steps along a ray
;Starts at depth d=-1 and enters the loop at U, so MAP runs 24 times and the
;step `d+=a` 23 times; on exit a = map(X,Y,d) for the final d.
;`loop` counts cx, which equals cl here because MAP returns with ch=0.
  mov cl,24
  movaps d,[K_MINUS_1]; d=-1
  jmp U
T addps d,a     ; d+=map(X,Y,d)
U call MAP
  loop T

;Compute Normal.Z (scaled by ambient occlusion)
;Finite difference of the distance estimate along the depth axis, with the
;first sample parked in the scratch vector at [bx].
  movaps [bx],a ; bx=0x480
  subps d,[K_EPS]
  call MAP      ; a = map(X,Y,d-EPS)
  subps a,[bx]  ; a = map(X,Y,d-EPS) - map(X,Y,d)

;Depth fog: depth -1..0..1 -> color 1..1..0
;  movaps b,[K_1]
;  subps b,d
;  minps b,[K_1]
;  mulps a,b     ; a *= min(1-d,1)

;Reject normals pointing away, clip by the far plane
;b = (NOT a) AND d, so the sign bit of b is set only when a's sign bit is
;clear and d's is set. movmskps gathers the 4 sign bits into ebp bits 0..3,
;which the next batch consumes one at a time with `shr bp,1`.
  subps d,[K_1] ; depth: -1..1 -> -2..-0
  movaps b,a
  andnps b,d
  movmskps ebp,b ; a>=0 and d<0? ok : 0 (background)

;Store brightness and hue
;Adding the magic constant leaves the scaled value in the lowest byte of each
;float (see the ulp note on K_BRIGHT_MAGIC); the batch loop reads that byte.
  addps a,[K_BRIGHT_MAGIC] ; shift the value into the lowest float byte
;  addps o,[K_HUE_MAGIC]
  movaps [bx],a    ; 0x480
;  movaps [bx+si],o ; 0x580

;Next pixel
;di is a 16-bit offset advanced by 4 per batch; the frame ends when it wraps to 0.
  test di,di
  jnz X

;Esc test, next frame
;Reads the keyboard data port; a value of 1 makes al zero after `dec`.
;In that case execution falls through into MAP, and its `ret` acts as the exit.
  in al,0x60
  dec al
  jnz M   ; fallthrough

;Return the box distance to the KIFS fractal
;In:   d = depth per lane; coordinate dwords at 0x400..0x40f; rotation vectors
;      at 0x420..0x47f; bh=4.
;Out:  a = distance estimate per lane; bx=0x480; ch=0.
;Uses: x, y, z, a, b, c, bl, ch, flags. d, cl and di are not written.
;Also reached by fallthrough from the Esc test, where the final `ret` pops
;whatever word is on top of the stack - presumably the 0 pushed last by PREP,
;i.e. a jump to offset 0 - TODO confirm.
MAP:
  mov bl,0       ; bh is still 4, so bx=0x400
  movups x,[INT_X]   ; unaligned on purpose: same dwords read one byte lower
  cvtdq2ps y,[INT_Y]
  cvtdq2ps x,x
  mulps y,[K_Y_SCALE]
  mulps x,[K_X_SCALE]
  movaps z,d    ; x,y,z = X,Y,depth
  movaps c,[K_TRANSLATION] ; c=-1/4: translation=[-c,-c/4,0]

;  xorps o,o     ; o=0

;  movaps o,c    ; o=1/4: easier to compress

  mov ch,14     ; number of iterations

;Rotate in the XZ, YX and ZY planes
;Three passes with bl=0x20,0x40,0x60; `jns` exits once `add bl,0x20` makes
;bl=0x80. Each pass rotates (x,z) by one stored angle, then cycles the axes.
L mov bl,0x20
R movaps b,[COS]; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,[SIN]
  mulps b,z     ; b=Cz
  mulps z,a     ; z=Sz
  mulps a,x     ; a=Sx
  mulps x,[COS] ; x=Cx
  subps a,b     ; a=x'=Sx-Cz
  addps z,x     ; z=z'=Sz+Cx
  movaps x,y    ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20   ; 0x20 | 0x40 | 0x60
  jns R         ; bx=0x480 a=z

;Reflect along X and Y
;OR with -0.0 sets the sign bit, giving -|value|.
  movaps b,[K_NEG_ABS]
  orps x,b      ; x=-|x|
  orps y,b      ; y=-|y|

;Box (L_inf) distance to the origin
  orps a,b      ; a=-|z|
  minps a,x
  minps a,y     ; a=-length = min(-|x|,-|y|,-|z|)

;TODO: remove xor length
  xorps a,b     ; a=length (flip the sign bit back)

;Orbit trap
;  maxps o,a     ; orbit=max(orbit,length)

;TODO: remove xor length
;  minps o,a     ; orbit=min(orbit,-length)

;Translate by [c,c/4,0]
;c is negative (starts at -1/4), so subtracting it moves x and y upwards.
  movaps b,[K_TRANSLATION_SCALE]
  mulps b,c     ; b=c/4
  subps x,c     ; x-=c
  subps y,b     ; y-=c/4

;Scale translation
  subps c,b     ; c-=c/4 (c*=3/4)

;Next iteration
  dec ch
  jnz L

;a is overwritten on every iteration, so only the last iteration's length
;survives to this point.
  addps a,c
  addps a,c     ; a=length+2c (c<0, so this shrinks the length by 2|c|)

;TODO: remove xor length
;  subps a,c
;  subps a,c     ; a=-(length-2c)

  ret           ; bx=0x480
